Imports required to unpack the datasets and load the DDS (Doing Data Science) data sets from a set of CSV files.
In [ ]:
import os
import zipfile
from metrique.core_api import PandasClient
def xall(path):
    """Extract every member of the zip archive at ``path``.

    Members are extracted into the current working directory. A leading
    ``~`` in ``path`` is expanded with ``os.path.expanduser``.

    Returns None.
    """
    # The context manager closes the archive's file handle even when
    # extraction raises, instead of leaving it open until garbage collection.
    with zipfile.ZipFile(os.path.expanduser(path)) as archive:
        archive.extractall()
In [ ]:
# Make sure the directory that the next cell changes into exists.
# The isdir guard keeps this cell safe to re-run (a plain mkdir would
# fail once the directory is already there).
repos_dir = os.path.expanduser('~/.metrique/repos')
if not os.path.isdir(repos_dir):
    os.makedirs(repos_dir)
In [ ]:
# Switch to the repos directory; the relative paths used in later cells
# (the git clones, the zip archives, nyt?.csv) resolve against it.
%cd ~/.metrique/repos
Clone the metrique git repo; install metrique
In [ ]:
# Clone the metrique sources into the current working directory.
!git clone https://github.com/drpoovilleorg/metrique.git
Clone the O'Reilly Doing Data Science sample dataset git repo; unpack the datasets and load them in.
In [ ]:
# Clone the Doing Data Science repo; a later cell reads
# doing_data_science/dds_datasets.zip from this checkout.
!git clone https://github.com/oreillymedia/doing_data_science.git
In [ ]:
# Skip extraction when nyt1.csv is already present in the working directory.
if not os.path.exists('nyt1.csv'):
    # Order matters: the second archive is read from the dds_datasets/
    # path, which is expected to come out of the first one.
    archives = (
        'doing_data_science/dds_datasets.zip',  # extracts various doing data science datasets
        'dds_datasets/dds_ch2_nyt.zip',  # extracts the nyt*.csv's
    )
    for archive in archives:
        xall(archive)
In [ ]:
# Client whose load() method is used in the cells below to read the data files.
z = PandasClient()
load up the datasets
In [25]:
# Globs are accepted. A single `?` matches exactly one character, so this
# loads only a sample of the nyt files (presumably nyt1.csv..nyt9.csv rather
# than the "first 10" -- TODO confirm against the extracted file names).
# Takes 10s+.
%time nyt = z.load('./nyt?.csv')
In [26]:
# Load the ch5 binary-class text file using a tab separator; %time reports
# how long the load takes.
%time ch5_binary = z.load('./dds_datasets/dds_ch5_binary-class-dataset.txt', sep='\t')
run pandas analysis
In [27]:
# Plot a histogram of the Impressions column of the nyt data.
nyt.Impressions.hist()
Out[27]:
In [28]:
# Plot a histogram of the last_sv column of the ch5_binary data.
ch5_binary.last_sv.hist()
Out[28]:
In [28]:
In [ ]: